{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Parser for Question-Answer Dataset\n",
    "This notebook parses the corpus of manually-generated factoid questions from Wikipedia articles. \n",
    "\n",
    "**Acknowledgments**\n",
    "http://www.cs.cmu.edu/~ark/QA-data/\n",
    "\n",
    "These data were collected by Noah Smith, Michael Heilman, Rebecca Hwa, Shay Cohen, Kevin Gimpel, and many students at Carnegie Mellon University and the University of Pittsburgh between 2008 and 2010.\n",
    " \n",
    "Their research project was supported by NSF IIS-0713265 (to Smith), an NSF Graduate Research Fellowship (to Heilman), NSF IIS-0712810 and IIS-0745914 (to Hwa), and Institute of Education Sciences, U.S. Department of Education R305B040063 (to Carnegie Mellon)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/data/datasets/cmu_wiki_qa/cmu_parser.ipynb)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# uncomment and run below lines to set up if running in colab\n",
    "# !git clone https://github.com/LAION-AI/Open-Assistant.git\n",
    "# %cd Open-Assistant/data/datasets/cmu_wiki_qa/\n",
    "# !pip install -r requirements.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download the dataset archive and unpack it into the current working directory.\n",
    "\n",
    "import requests\n",
    "import tarfile\n",
    "\n",
    "DATASET_URL = \"http://www.cs.cmu.edu/~ark/QA-data/data/Question_Answer_Dataset_v1.2.tar.gz\"\n",
    "\n",
    "with requests.get(DATASET_URL, stream=True, timeout=60) as resp:\n",
    "    # fail loudly on 4xx/5xx instead of handing an HTML error page to tarfile\n",
    "    resp.raise_for_status()\n",
    "    # resp.raw is a non-seekable stream, so read the archive in tarfile's stream mode (\"r|gz\")\n",
    "    with tarfile.open(fileobj=resp.raw, mode=\"r|gz\") as archive:\n",
    "        # NOTE(review): extractall() trusts the member paths stored in the archive; only use with a trusted source\n",
    "        archive.extractall()  # extract to ./Question_Answer_Dataset_v1.2\n",
    "\n",
    "print(\"Done.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Global settings read by the loading and cleaning cells below.\n",
    "\n",
    "FOLDER = \"Question_Answer_Dataset_v1.2\"  # input folder whose subfolders are scanned for Q&A files\n",
    "CSV = \"question_answer_pairs.txt\"  # file name to look for in each subfolder (parsed as tab-separated text)\n",
    "SOURCE = \"wikipedia/cmu_qa\"  # value written to the SOURCE column of every output row\n",
    "MUST_INCLUDE_ENTITY = True  # if True, skip questions that contain no word of the article title\n",
    "MUST_BE_QUALITY = True  # if True, skip yes/no, very short (< 4 characters) and one-word answers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import io\n",
    "import re\n",
    "import json\n",
    "\n",
    "from tqdm import tqdm\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ArticleTitle</th>\n",
       "      <th>Question</th>\n",
       "      <th>Answer</th>\n",
       "      <th>DifficultyFromQuestioner</th>\n",
       "      <th>DifficultyFromAnswerer</th>\n",
       "      <th>ArticleFile</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Abraham_Lincoln</td>\n",
       "      <td>Was Abraham Lincoln the sixteenth President of...</td>\n",
       "      <td>yes</td>\n",
       "      <td>easy</td>\n",
       "      <td>easy</td>\n",
       "      <td>data/set3/a4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Abraham_Lincoln</td>\n",
       "      <td>Was Abraham Lincoln the sixteenth President of...</td>\n",
       "      <td>Yes.</td>\n",
       "      <td>easy</td>\n",
       "      <td>easy</td>\n",
       "      <td>data/set3/a4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Abraham_Lincoln</td>\n",
       "      <td>Did Lincoln sign the National Banking Act of 1...</td>\n",
       "      <td>yes</td>\n",
       "      <td>easy</td>\n",
       "      <td>medium</td>\n",
       "      <td>data/set3/a4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Abraham_Lincoln</td>\n",
       "      <td>Did Lincoln sign the National Banking Act of 1...</td>\n",
       "      <td>Yes.</td>\n",
       "      <td>easy</td>\n",
       "      <td>easy</td>\n",
       "      <td>data/set3/a4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Abraham_Lincoln</td>\n",
       "      <td>Did his mother die of pneumonia?</td>\n",
       "      <td>no</td>\n",
       "      <td>easy</td>\n",
       "      <td>medium</td>\n",
       "      <td>data/set3/a4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      ArticleTitle                                           Question Answer  \\\n",
       "0  Abraham_Lincoln  Was Abraham Lincoln the sixteenth President of...    yes   \n",
       "1  Abraham_Lincoln  Was Abraham Lincoln the sixteenth President of...   Yes.   \n",
       "2  Abraham_Lincoln  Did Lincoln sign the National Banking Act of 1...    yes   \n",
       "3  Abraham_Lincoln  Did Lincoln sign the National Banking Act of 1...   Yes.   \n",
       "4  Abraham_Lincoln                   Did his mother die of pneumonia?     no   \n",
       "\n",
       "  DifficultyFromQuestioner DifficultyFromAnswerer   ArticleFile  \n",
       "0                     easy                   easy  data/set3/a4  \n",
       "1                     easy                   easy  data/set3/a4  \n",
       "2                     easy                 medium  data/set3/a4  \n",
       "3                     easy                   easy  data/set3/a4  \n",
       "4                     easy                 medium  data/set3/a4  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read every Q&A file found in the subfolders of FOLDER and concatenate them into a single df.\n",
    "frames = []\n",
    "for sub in sorted(os.listdir(FOLDER)):  # sorted so the row order does not depend on the file system\n",
    "    subdir = os.path.join(FOLDER, sub)\n",
    "    if not os.path.isdir(subdir):\n",
    "        continue\n",
    "    for fname in sorted(os.listdir(subdir)):\n",
    "        if fname.strip() != CSV:\n",
    "            continue\n",
    "        # the files are ISO-8859-1 encoded and tab separated\n",
    "        with open(os.path.join(subdir, fname), \"r\", encoding=\"ISO-8859-1\") as handle:\n",
    "            frames.append(pd.read_csv(handle, sep=\"\\t\"))\n",
    "\n",
    "if not frames:\n",
    "    raise FileNotFoundError(f\"No '{CSV}' file found in the subfolders of '{FOLDER}'\")\n",
    "\n",
    "# concatenate once (not inside the loop); drop_duplicates() returns a new frame, so assign the result\n",
    "data = pd.concat(frames).drop_duplicates()\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████████████████████████████████████████████████| 109/109 [00:02<00:00, 46.44it/s]\n"
     ]
    }
   ],
   "source": [
    "# Clean up the df: keep at most one answer per question (the longest one that passes the filters).\n",
    "clean = {col: [] for col in [\"INSTRUCTION\", \"RESPONSE\", \"SOURCE\", \"METADATA\"]}\n",
    "\n",
    "for name, group in tqdm(data.groupby(\"ArticleTitle\")):\n",
    "    # any word of the article title counts as a mention of the article (e.g. \"Lincoln\" for \"Abraham_Lincoln\");\n",
    "    # empty tokens are dropped because an empty alternative would match every question\n",
    "    words = [re.escape(word) for word in name.lower().strip().split(\"_\") if word]\n",
    "    # compiled once per article instead of once per question\n",
    "    entity = re.compile(r\"(?i)\\b(\" + \"|\".join(words) + r\")\") if words else None\n",
    "    for question, qgroup in group.groupby(\"Question\"):\n",
    "        # see if the question includes the article title (instead of referring to it, such as he, she, it, etc.)\n",
    "        if MUST_INCLUDE_ENTITY and not (entity and entity.search(question)):\n",
    "            continue\n",
    "        # try the longest answer first; sort_values returns a new frame, so the groupby slice is not mutated\n",
    "        candidates = qgroup.sort_values(by=\"Answer\", key=lambda x: x.str.len(), ascending=False)\n",
    "        for _, row in candidates.iterrows():\n",
    "            if pd.isna(row[\"Answer\"]):\n",
    "                continue  # no answer given\n",
    "            # cast once so the length checks below also work if pandas parsed an answer as a number\n",
    "            answer = str(row[\"Answer\"])\n",
    "            if not answer:\n",
    "                continue  # empty answer\n",
    "            quality = True\n",
    "            if re.search(r\"(?i)^(yes|no)\\W*$\", answer):\n",
    "                quality = False  # yes / no answer\n",
    "            elif len(answer) < 4:\n",
    "                quality = False  # short answer\n",
    "            elif len(answer.split()) < 2:\n",
    "                quality = False  # one word answer\n",
    "            if not quality and MUST_BE_QUALITY:\n",
    "                continue  # skip low quality\n",
    "\n",
    "            # NOTE(review): the key \"diffculty_answerer\" is misspelled; it is left unchanged here in case\n",
    "            # consumers of the output already depend on it - confirm before renaming\n",
    "            metadata = {\n",
    "                \"article_title\": name,\n",
    "                \"article_file\": row[\"ArticleFile\"],\n",
    "                \"difficulty_questioner\": row[\"DifficultyFromQuestioner\"],\n",
    "                \"diffculty_answerer\": row[\"DifficultyFromAnswerer\"],\n",
    "                # \"quality\": quality,\n",
    "            }\n",
    "            # json.dumps would write missing values as the non-standard token NaN; store them as null instead\n",
    "            metadata = {key: (None if pd.isna(value) else value) for key, value in metadata.items()}\n",
    "\n",
    "            clean[\"INSTRUCTION\"].append(question.strip())\n",
    "            clean[\"RESPONSE\"].append(answer)\n",
    "            clean[\"SOURCE\"].append(SOURCE)\n",
    "            clean[\"METADATA\"].append(json.dumps(metadata))\n",
    "            break  # include only one answer for each question"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>INSTRUCTION</th>\n",
       "      <th>RESPONSE</th>\n",
       "      <th>SOURCE</th>\n",
       "      <th>METADATA</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>How long was Lincoln's legal Career?</td>\n",
       "      <td>23 years</td>\n",
       "      <td>wikipedia/cmu_qa</td>\n",
       "      <td>{\"article_title\": \"Abraham_Lincoln\", \"article_...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>How many long was Lincoln's formal education?</td>\n",
       "      <td>18 months.</td>\n",
       "      <td>wikipedia/cmu_qa</td>\n",
       "      <td>{\"article_title\": \"Abraham_Lincoln\", \"article_...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>What trail did Lincoln use a Farmers' Almanac in?</td>\n",
       "      <td>he defended William \"Duff\" Armstrong</td>\n",
       "      <td>wikipedia/cmu_qa</td>\n",
       "      <td>{\"article_title\": \"Abraham_Lincoln\", \"article_...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>When did Lincoln first serve as President?</td>\n",
       "      <td>March 4, 1861</td>\n",
       "      <td>wikipedia/cmu_qa</td>\n",
       "      <td>{\"article_title\": \"Abraham_Lincoln\", \"article_...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Which county was Lincoln born in?</td>\n",
       "      <td>Southeast Hardin County, Kentucky</td>\n",
       "      <td>wikipedia/cmu_qa</td>\n",
       "      <td>{\"article_title\": \"Abraham_Lincoln\", \"article_...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                         INSTRUCTION  \\\n",
       "0               How long was Lincoln's legal Career?   \n",
       "1      How many long was Lincoln's formal education?   \n",
       "2  What trail did Lincoln use a Farmers' Almanac in?   \n",
       "3         When did Lincoln first serve as President?   \n",
       "4                  Which county was Lincoln born in?   \n",
       "\n",
       "                               RESPONSE            SOURCE  \\\n",
       "0                              23 years  wikipedia/cmu_qa   \n",
       "1                            18 months.  wikipedia/cmu_qa   \n",
       "2  he defended William \"Duff\" Armstrong  wikipedia/cmu_qa   \n",
       "3                         March 4, 1861  wikipedia/cmu_qa   \n",
       "4     Southeast Hardin County, Kentucky  wikipedia/cmu_qa   \n",
       "\n",
       "                                            METADATA  \n",
       "0  {\"article_title\": \"Abraham_Lincoln\", \"article_...  \n",
       "1  {\"article_title\": \"Abraham_Lincoln\", \"article_...  \n",
       "2  {\"article_title\": \"Abraham_Lincoln\", \"article_...  \n",
       "3  {\"article_title\": \"Abraham_Lincoln\", \"article_...  \n",
       "4  {\"article_title\": \"Abraham_Lincoln\", \"article_...  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Remove accidental duplicates: rows are ordered by response length (longest first) so that\n",
    "# drop_duplicates keeps the longest response per instruction, then the original order is restored.\n",
    "clean = (\n",
    "    pd.DataFrame(clean)\n",
    "    .sort_values(by=\"RESPONSE\", key=lambda col: col.str.len(), ascending=False)\n",
    "    .drop_duplicates(subset=[\"INSTRUCTION\"])\n",
    "    .sort_index()\n",
    ")\n",
    "clean.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Retrieved 18.71% of all questions (748)\n"
     ]
    }
   ],
   "source": [
    "# Share of distinct questions that survived the filters. The denominator counts unique questions\n",
    "# (whitespace-stripped, like INSTRUCTION) rather than answer rows, and is guarded against being zero.\n",
    "total_questions = data[\"Question\"].str.strip().nunique()\n",
    "share = len(clean) / total_questions * 100.0 if total_questions else 0.0\n",
    "print(f\"Retrieved {share:.2f}% of all questions ({len(clean)})\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
